# load generated data
hour_data <- read_csv("hour_data.csv")
# The grouped data does not have the original date column, but it can be nice for visualizations. There is a function to do this in times.R
hour_data_date <- add_date_column(hour_data)
We are performing exploratory data analysis on the training data only.
set.seed(28021995)
# create a split object
data_split <- initial_split(hour_data_date, prop = 0.8)
# create the training and testing data
train_data_date <- training(x = data_split)
test_data_date <- testing(x = data_split)
# summary entire train dataset
summary(train_data_date)
## year month day hour
## Min. :2020 Length:9944 Min. : 1.00 Min. : 0.00
## 1st Qu.:2020 Class :character 1st Qu.: 8.00 1st Qu.: 6.00
## Median :2021 Mode :character Median :16.00 Median :12.00
## Mean :2021 Mean :15.71 Mean :11.53
## 3rd Qu.:2021 3rd Qu.:23.00 3rd Qu.:18.00
## Max. :2021 Max. :31.00 Max. :23.00
##
## 20th_&_e_st_nw_arrivals new_hampshire_ave_&_24th_st_nw_arrivals
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.0000 Median : 1.000
## Mean : 0.5535 Mean : 1.282
## 3rd Qu.: 1.0000 3rd Qu.: 2.000
## Max. :20.0000 Max. :13.000
##
## 15th_st_&_constitution_ave_nw_departures 21st_&_i_st_nw_arrivals
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000
## Mean : 1.786 Mean : 0.774
## 3rd Qu.: 2.000 3rd Qu.: 1.000
## Max. :36.000 Max. :23.000
##
## 17th_&_g_st_nw_arrivals 19th_st_&_constitution_ave_nw_departures
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.8188 Mean : 0.7992
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. :21.0000 Max. :33.0000
##
## 23rd_&_e_st_nw_arrivals henry_bacon_dr_&_lincoln_memorial_circle_nw_arrivals
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.0000 Median : 1.000
## Mean : 0.6858 Mean : 2.617
## 3rd Qu.: 1.0000 3rd Qu.: 4.000
## Max. :14.0000 Max. :47.000
##
## henry_bacon_dr_&_lincoln_memorial_circle_nw_departures
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 2.629
## 3rd Qu.: 4.000
## Max. :46.000
##
## jefferson_dr_&_14th_st_sw_departures jefferson_memorial_arrivals
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.000 Median : 0.000
## Mean : 2.928 Mean : 1.788
## 3rd Qu.: 4.000 3rd Qu.: 2.000
## Max. :44.000 Max. :40.000
##
## jefferson_memorial_departures 18th_st_&_pennsylvania_ave_nw_arrivals
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000
## Mean : 1.798 Mean : 0.5459
## 3rd Qu.: 2.000 3rd Qu.: 1.0000
## Max. :42.000 Max. :19.0000
##
## 19th_st_&_constitution_ave_nw_arrivals
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.8371
## 3rd Qu.: 1.0000
## Max. :33.0000
##
## us_dept_of_state_/_virginia_ave_&_21st_st_nw_arrivals
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.4419
## 3rd Qu.: 0.0000
## Max. :16.0000
##
## virginia_ave_&_25th_st_nw_arrivals virginia_ave_&_25th_st_nw_departures
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.6104 Mean : 0.5786
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. :12.0000 Max. :11.0000
##
## 19th_&_g_st_nw_arrivals 19th_st_&_pennsylvania_ave_nw_departures
## Min. : 0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 0.0000 Median :0.0000
## Mean : 0.2494 Mean :0.3012
## 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :11.0000 Max. :9.0000
##
## 22nd_st_&_constitution_ave_nw_arrivals 22nd_&_h_st_nw_arrivals
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000
## Mean : 0.556 Mean : 0.5047
## 3rd Qu.: 1.000 3rd Qu.: 1.0000
## Max. :37.000 Max. :11.0000
##
## 22nd_&_h_st_nw_departures 17th_&_g_st_nw_departures 19th_&_g_st_nw_departures
## Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean : 0.4967 Mean : 0.761 Mean : 0.2508
## 3rd Qu.: 1.0000 3rd Qu.: 1.000 3rd Qu.: 0.0000
## Max. :12.0000 Max. :29.000 Max. :15.0000
##
## 21st_st_&_constitution_ave_nw_arrivals
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.4718
## 3rd Qu.: 0.0000
## Max. :18.0000
##
## 21st_st_&_constitution_ave_nw_departures
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.4658
## 3rd Qu.: 0.0000
## Max. :17.0000
##
## 21st_st_&_pennsylvania_ave_nw_arrivals
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.4555
## 3rd Qu.: 1.0000
## Max. :10.0000
##
## 21st_st_&_pennsylvania_ave_nw_departures
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.3819
## 3rd Qu.: 0.0000
## Max. :12.0000
##
## 22nd_&_i_st_nw_/_foggy_bottom_arrivals
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 1.193
## 3rd Qu.: 2.000
## Max. :16.000
##
## 22nd_&_i_st_nw_/_foggy_bottom_departures kennedy_center_arrivals
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000
## Mean : 1.189 Mean : 0.5179
## 3rd Qu.: 2.000 3rd Qu.: 1.0000
## Max. :14.000 Max. :16.0000
##
## kennedy_center_departures 18th_st_&_pennsylvania_ave_nw_departures
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.0000 Median : 0.000
## Mean : 0.4896 Mean : 0.518
## 3rd Qu.: 0.0000 3rd Qu.: 1.000
## Max. :14.0000 Max. :24.000
##
## 22nd_st_&_constitution_ave_nw_departures 23rd_&_e_st_nw_departures
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.5878 Mean : 0.7135
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. :33.0000 Max. :13.0000
##
## lincoln_memorial_departures 18th_&_c_st_nw_departures
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 3.378 Mean : 0.3965
## 3rd Qu.: 5.000 3rd Qu.: 0.0000
## Max. :58.000 Max. :30.0000
##
## 21st_&_i_st_nw_departures new_hampshire_ave_&_24th_st_nw_departures
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.0000 Median : 1.000
## Mean : 0.7431 Mean : 1.194
## 3rd Qu.: 1.0000 3rd Qu.: 2.000
## Max. :20.0000 Max. :12.000
##
## ohio_dr_&_west_basin_dr_sw_/_mlk_&_fdr_memorials_arrivals
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.328
## 3rd Qu.: 2.000
## Max. :46.000
##
## ohio_dr_&_west_basin_dr_sw_/_mlk_&_fdr_memorials_departures
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.336
## 3rd Qu.: 2.000
## Max. :51.000
##
## us_dept_of_state_/_virginia_ave_&_21st_st_nw_departures
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.4301
## 3rd Qu.: 0.0000
## Max. :16.0000
##
## 15th_st_&_constitution_ave_nw_arrivals 17th_st_&_independence_ave_sw_arrivals
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000
## Mean : 1.966 Mean : 1.686
## 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :37.000 Max. :45.000
##
## 18th_&_c_st_nw_arrivals 19th_st_&_pennsylvania_ave_nw_arrivals
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.4145 Mean : 0.3001
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :17.0000 Max. :10.0000
##
## 19th_&_e_street_nw_departures jefferson_dr_&_14th_st_sw_arrivals
## Min. : 0.00000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 0.000
## Median : 0.00000 Median : 1.000
## Mean : 0.09141 Mean : 2.947
## 3rd Qu.: 0.00000 3rd Qu.: 4.000
## Max. :10.00000 Max. :44.000
##
## lincoln_memorial_arrivals 20th_&_e_st_nw_departures
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 3.153 Mean : 0.5774
## 3rd Qu.: 4.000 3rd Qu.: 1.0000
## Max. :49.000 Max. :18.0000
##
## 17th_st_&_independence_ave_sw_departures 19th_&_e_street_nw_arrivals
## Min. : 0.00 Min. :0.00000
## 1st Qu.: 0.00 1st Qu.:0.00000
## Median : 0.00 Median :0.00000
## Mean : 1.74 Mean :0.09222
## 3rd Qu.: 2.00 3rd Qu.:0.00000
## Max. :44.00 Max. :9.00000
##
## 20th_st_&_virginia_ave_nw_arrivals 20th_st_&_virginia_ave_nw_departures
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.2013 Mean : 0.1833
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :15.0000 Max. :16.0000
##
## 19th_&_k_st_nw_arrivals 19th_&_k_st_nw_departures
## Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000
## Mean :0.002615 Mean :0.002514
## 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.000000
##
## crystal_dr_&_23rd_st_s_arrivals
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0001006
## 3rd Qu.:0.0000000
## Max. :1.0000000
##
## smithsonian-national_mall_/_jefferson_dr_&_12th_st_sw_arrivals
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0001006
## 3rd Qu.:0.0000000
## Max. :1.0000000
##
## 7th_&_t_st_nw_arrivals crystal_dr_&_15th_st_s_arrivals
## Min. :0 Min. :0.0000000
## 1st Qu.:0 1st Qu.:0.0000000
## Median :0 Median :0.0000000
## Mean :0 Mean :0.0001006
## 3rd Qu.:0 3rd Qu.:0.0000000
## Max. :0 Max. :1.0000000
##
## 20th_&_o_st_nw_/_dupont_south_arrivals eads_st_&_12th_st_s_arrivals
## Min. :0 Min. :0.0000000
## 1st Qu.:0 1st Qu.:0.0000000
## Median :0 Median :0.0000000
## Mean :0 Mean :0.0001006
## 3rd Qu.:0 3rd Qu.:0.0000000
## Max. :0 Max. :1.0000000
##
## iwo_jima_memorial_/_meade_&_14th_st_n_arrivals potomac_&_m_st_nw_arrivals
## Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median :0.0000000 Median :0.0000000
## Mean :0.0001006 Mean :0.0003017
## 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :1.0000000 Max. :2.0000000
##
## gravelly_point_arrivals 21st_st_&_g_st_nw_arrivals
## Min. :0.0000000 Min. : 0.0000
## 1st Qu.:0.0000000 1st Qu.: 0.0000
## Median :0.0000000 Median : 0.0000
## Mean :0.0001006 Mean : 0.4806
## 3rd Qu.:0.0000000 3rd Qu.: 0.0000
## Max. :1.0000000 Max. :18.0000
##
## 21st_st_&_g_st_nw_departures maine_ave_&_9th_st_sw_arrivals
## Min. : 0.0000 Min. :0.0000000
## 1st Qu.: 0.0000 1st Qu.:0.0000000
## Median : 0.0000 Median :0.0000000
## Mean : 0.4757 Mean :0.0002011
## 3rd Qu.: 0.0000 3rd Qu.:0.0000000
## Max. :15.0000 Max. :1.0000000
##
## roosevelt_island_arrivals n_lynn_st_&_fairfax_dr_arrivals
## Min. :0.0000000 Min. :0
## 1st Qu.:0.0000000 1st Qu.:0
## Median :0.0000000 Median :0
## Mean :0.0001006 Mean :0
## 3rd Qu.:0.0000000 3rd Qu.:0
## Max. :1.0000000 Max. :0
##
## 4th_st_&_g_st_sw_arrivals 20th_&_l_st_nw_arrivals
## Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median :0.0000000 Median :0.0000000
## Mean :0.0001006 Mean :0.0001006
## 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :1.0000000 Max. :1.0000000
##
## 15th_st_&_pennsylvania_ave_nw_arrivals
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0001006
## 3rd Qu.:0.0000000
## Max. :1.0000000
##
## woodley_park_metro_/_calvert_st_&_connecticut_ave_nw_arrivals
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0001006
## 3rd Qu.:0.0000000
## Max. :1.0000000
##
## 15th_&_p_st_nw_arrivals reservoir_rd_&_38th_st_nw_arrivals
## Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median :0.0000000 Median :0.0000000
## Mean :0.0001006 Mean :0.0001006
## 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :1.0000000 Max. :1.0000000
##
## 3000_connecticut_ave_nw_/_national_zoo_arrivals white_house_departures
## Min. :0.0000000 Min. :0.000000
## 1st Qu.:0.0000000 1st Qu.:0.000000
## Median :0.0000000 Median :0.000000
## Mean :0.0001006 Mean :0.002011
## 3rd Qu.:0.0000000 3rd Qu.:0.000000
## Max. :1.0000000 Max. :3.000000
##
## white_house_arrivals stadium_armory_metro_arrivals maximum_temperature
## Min. :0.000000 Min. :0.0000000 Min. :23.10
## 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:50.85
## Median :0.000000 Median :0.0000000 Median :66.80
## Mean :0.002212 Mean :0.0001006 Mean :63.73
## 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:76.80
## Max. :3.000000 Max. :1.0000000 Max. :98.00
## NA's :1
## minimum_temperature temperature wind_chill heat_index
## Min. :23.10 Min. :23.10 Min. :10.60 Min. : 79.3
## 1st Qu.:50.85 1st Qu.:50.85 1st Qu.:27.62 1st Qu.: 84.9
## Median :66.80 Median :66.80 Median :34.00 Median : 88.8
## Mean :63.73 Mean :63.73 Mean :33.72 Mean : 89.4
## 3rd Qu.:76.80 3rd Qu.:76.80 3rd Qu.:40.70 3rd Qu.: 93.2
## Max. :98.00 Max. :98.00 Max. :49.40 Max. :108.3
## NA's :1 NA's :1 NA's :7790 NA's :8316
## precipitation snow snow_depth wind_speed
## Min. :0.000000 Min. :0.0000000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000 1st Qu.: 4.800
## Median :0.000000 Median :0.0000000 Median :0.0000 Median : 7.600
## Mean :0.006046 Mean :0.0008609 Mean :0.0316 Mean : 8.043
## 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.0000 3rd Qu.:10.700
## Max. :1.030000 Max. :1.1300000 Max. :2.9000 Max. :36.400
## NA's :1 NA's :1 NA's :1 NA's :1
## wind_direction wind_gust visibility cloud_cover
## Min. : 0.0 Min. :16.10 Min. : 0.000 Min. : 0.00
## 1st Qu.:107.0 1st Qu.:20.80 1st Qu.: 9.900 1st Qu.: 33.10
## Median :191.0 Median :24.15 Median : 9.900 Median : 69.70
## Mean :191.2 Mean :25.36 Mean : 9.491 Mean : 61.72
## 3rd Qu.:301.0 3rd Qu.:29.60 3rd Qu.: 9.900 3rd Qu.: 89.40
## Max. :360.0 Max. :57.50 Max. :10.000 Max. :100.00
## NA's :3 NA's :8334 NA's :1 NA's :1
## relative_humidity conditions sun_is_out
## Min. : 14.19 Length:9944 Min. :0.0000
## 1st Qu.: 50.60 Class :character 1st Qu.:0.0000
## Median : 66.71 Mode :character Median :1.0000
## Mean : 65.29 Mean :0.5707
## 3rd Qu.: 81.11 3rd Qu.:1.0000
## Max. :100.00 Max. :1.0000
## NA's :1
## date
## Min. :2020-05-01 01:00:00
## 1st Qu.:2020-09-08 07:45:00
## Median :2021-01-14 10:30:00
## Mean :2021-01-14 15:02:20
## 3rd Qu.:2021-05-24 00:15:00
## Max. :2021-09-30 23:00:00
##
# summary lincoln_memorial_departures
train_data_date %>%
summarize(
mean(lincoln_memorial_departures),
max(lincoln_memorial_departures),
min(lincoln_memorial_departures),
sd(lincoln_memorial_departures))
## # A tibble: 1 × 4
## `mean(lincoln_memo… `max(lincoln_memor… `min(lincoln_memor… `sd(lincoln_memor…
## <dbl> <dbl> <dbl> <dbl>
## 1 3.38 58 0 5.38
# how can we get better summary stats? maybe not split by hour?
# we can group the data, and sum if we need to
# check how much NA in temperature variable
# sum(is.na(train_data_date$temperature)) #0
# sum(is.na(train_data_date$cloud_cover)) #0
train_data_date %>%
# mutate(weekday = wday(date, label=TRUE)) %>%
group_by(month) %>%
summarize(mean(temperature), max(temperature), min(temperature), sd(temperature))
## # A tibble: 12 × 5
## month `mean(temperature)` `max(temperature… `min(temperatur… `sd(temperature…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Apr 57.8 85.7 30.7 10.6
## 2 Aug 78.8 96 65.5 5.85
## 3 Dec 40.7 65.9 23.1 7.58
## 4 Feb 37.2 59.4 24.8 7.40
## 5 Jan 38.1 56.9 23.7 6.10
## 6 Jul 81.5 98 62.9 6.56
## 7 Jun 76.1 93.9 55.3 7.47
## 8 Mar NA NA NA NA
## 9 May 64.4 92.7 37 10.2
## 10 Nov 53.3 74.3 31.5 8.23
## 11 Oct 61.0 80.1 40.8 7.55
## 12 Sep 71.1 89.9 46.7 7.67
# density plot (no log)
train_data_date %>%
filter(lincoln_memorial_departures != 0) %>% # notice that we here are dropping 0s
ggplot(aes(x = lincoln_memorial_departures)) +
geom_density() +
theme_minimal() +
labs(title = "xxx",
x = "Number of departures from Lincoln Memorial")
# What does the number 50 entail?
# density plot (log)
train_data_date %>%
filter(lincoln_memorial_departures != 0) %>% # notice that we here are dropping 0s
ggplot(aes(x = log(lincoln_memorial_departures))) + # add log to fix skewness
geom_density() +
theme_minimal() +
labs(title = "xxx",
x = "Number of departures from Lincoln Memorial (logged)")
# What does the number 4 entail?
In this part, we use Lincoln Memorial as an example to show the change in amount of rides over time.
train_data_date %>% # for me here there is a weird gap after July 2020, not sure why?
ggplot(aes(x = date, y = lincoln_memorial_departures)) +
geom_point(alpha = 0.1, color = "orange") +
geom_smooth() +
labs(title = "January has the least demand of capital bikes",
x = "Date",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# by weekday
train_data_date %>%
mutate(weekday = wday(date, label=TRUE)) %>%
ggplot(aes(x = weekday, y = lincoln_memorial_departures)) +
geom_point(alpha = 0.1) + # clearly people cycle more on Saturdays and Sundays
geom_smooth() +
labs(title = "At Lincoln Memorial, Capitals bikes are mostly used for recreational purpose",
subtitle = "Weekend has the highest demand",
x = "Weekday (Aggregate)",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# farah's code here although similar to first one here?
#hour_data_date %>%
#mutate(weekday = wday(date, label=TRUE)) %>%
#ggplot(aes(x = weekday, y = lincoln_memorial)) +
#geom_point(alpha = 0.25) +
#labs(title = "Capital bikeshare Ridership on Week days") +
#theme_minimal()
# by day of the month
train_data_date %>%
ggplot(aes(x = day, y = lincoln_memorial_departures)) +
geom_point(alpha = 0.1, color = "salmon") + # there is no clear pattern across days of the month - aka probably not a good predictor
geom_smooth() +
facet_wrap(~month, scales = "free_y") +
labs(title = "At Lincoln Memorial, demand for bikes is more fluctuated in tourist season",
x = "Day of a month",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# by hour
train_data_date %>%
ggplot(aes(x = factor(hour), y = lincoln_memorial_departures)) +
geom_col(fill = "deepskyblue2", alpha = 0.8) +
geom_smooth() +
theme_minimal() +
labs(title = "At Lincoln Memorial, bikes are most demanded in the afternoon from 12pm-6pm",
subtitle = "The highest demand occurs at 2pm",
x = "Hour (Aggregate)",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# for a specific month and day, seeing variance by hour
train_data_date %>%
filter(year == 2021, month == "Aug", day == 15) %>%
ggplot(aes(x = factor(hour), y = lincoln_memorial_departures)) +
geom_col(fill = "Red", alpha = 0.5) +
theme_minimal() +
labs(title = "On Aug 15, the highest demand occured at 12pm and 2pm",
x = "Hour",
y = "The Number of Departures at Lincoln Memorial")
# by hour & month
train_data_date %>%
ggplot(aes(x = hour, y = lincoln_memorial_departures)) +
geom_point(alpha = 0.1, color = "dark green") + # people cycle more in the middle of the day, like between 1 and 7PM
geom_smooth() +
facet_wrap(~month)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# add month order
order_month <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# by month
train_data_date %>%
ggplot(aes(x = month, y = lincoln_memorial_departures), level = order_month) +
geom_point(alpha = 0.1, color = "dark red") +
scale_x_discrete(limits = month.abb) +
# people cycle most in Spring, summer and fall. Surprised that there is no big spike in summer months
geom_smooth() +
theme_minimal() +
labs(title = "At Lincoln Memorial, people cycle the most in Spring",
x = "Month (Aggregate)",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# by year
train_data_date %>%
ggplot(aes(x = factor(year), y = lincoln_memorial_departures)) +
geom_col(width = 0.3, color = "pink") +
geom_smooth() +
theme_minimal() +
labs(title = "At Lincoln Memorial, more people use the bike in 2021 than in 2020",
x = "Year (Aggregate)",
y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#prep training data for holiday barchart - 1 year of data from May 2020 to April 2021
bar_date <- train_data_date %>%
filter(year==2020 | year==2021 & month== "Jan" | year==2021 & month== "Feb"|year==2021 & month== "Mar"|year==2021 & month== "Apr") %>%
group_by(day, month, year)%>%
summarise(lincoln_memorial_departures=sum(lincoln_memorial_departures), n())
## `summarise()` has grouped output by 'day', 'month'. You can override using the `.groups` argument.
## confirmed: 365 days exist in this data frame but not everyday has 24 hours because the training data does not have the full data
# check for missing data
sum(is.na(bar_date$lincoln_memorial_departures)) #0
## [1] 0
# what is the mean departures per day?
mean_day <- bar_date %>%
group_by(day) %>%
summarize(n= mean(lincoln_memorial_departures)) %>%
summarize(mean(n)) ## mean per day is 55.6
# create a data frame for holidays
holidays <- generated_holidays %>%
filter(country == "US")
holidays$ds <- as_date(ymd(holidays$ds))
holidays <- holidays %>%
mutate(
day = day(ymd(ds)),
month = month(ymd(ds), label = TRUE)
) %>%
filter(year==2020 | year==2021)
holidays_date <- left_join(bar_date, holidays, by= c('year', 'month', 'day')) %>%
na.omit(holidays)
holidays_date %>%
ggplot(aes(y= holiday, x= lincoln_memorial_departures , fill=lincoln_memorial_departures)) +
geom_col()+
geom_vline(xintercept = 55.6, color = "red") +
xlab("") +
ylab("Number of departures") +
scale_fill_gradient(low = "#FFFFFFFF",
high = "#012345") +
#scale_fill_gradient2() +
#scale_colour_discrete("Holidays")+
#scale_fill_manual(c("#FF6666")) +
guides(fill=guide_legend(title="Number of departures")) +
theme_bw() +
labs(title = "Demand is highest among holidays such as Independence Day, Labor Day and Memorial Day",
x = "",
y = "The number of departures from Licoln Memorial")
ggsave("holidaybar.jpg", width = 50, height = 20, units = "cm")
In this part, we visualize the demand of bikes at the Lincoln Memorial station by different whether conditions.
# weather viz
train_data_date %>%
ggplot(aes(x = temperature, y = lincoln_memorial_departures))+
geom_point(alpha = 0.1,color = "orange") +
labs(x = 'Temperature', y = 'Hourly Departures and Arrivals')+
geom_smooth() +
theme_minimal() +
labs(title = "As the temperature goes up, departure also goes up",
subtitle = "There is a linear relationship between demand and temperature",
x = "Temperature (Fahrenheit)",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# temperature in each month
train_data_date %>%
ggplot(aes(x = temperature, y = lincoln_memorial_departures))+
geom_point(alpha = 0.07, color = "orange") +
geom_smooth() +
facet_wrap(~month) +
theme_light() +
labs(title = "As the temperature goes up, demand for bikes also goes up",
subtitle = "There is a rough linear relationship between temperature demand except August",
x = "Temperature (Fahrenheit)",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# wind viz
train_data_date %>%
ggplot(aes(x = wind_speed,y = lincoln_memorial_departures))+
geom_point(alpha = 0.1,color = "deepskyblue2") +
geom_smooth() + # data is sparse hence why we get this weird trend
theme_minimal() +
labs(title = "Demand for bikes is not sensitive to wind speed",
subtitle = "No clear relationship is shown between wind speed and bike demand",
x = "Wind Speed (miles per minute)",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# rain viz
train_data_date %>%
filter(precipitation != 0) %>%
#removing 0 to better show relationship
ggplot(aes(x = precipitation, y = lincoln_memorial_departures))+
geom_point(alpha = 0.1,color = "blue") +
geom_smooth() +
theme_minimal() +
labs(title = "Negative relationship between bike demand and precipitation",
subtitle = "No clear relationship is shown between wind speed and bike demand",
x = "Precipitation (include drizzling, rain, sleet, snow, ice pellets, graupel and hail)",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# data very sparse hence why weird trend. Aaron recommended also creating a variable that dychotomizes rain. 1 if it rains 0 if it doesn't.
#cloud viz
train_data_date %>%
ggplot(aes(x = cloud_cover,y = lincoln_memorial_departures)) +
geom_point(alpha = 0.5, color = "lightblue") +
geom_smooth() +
theme_minimal() +
labs(title = "Demand for bikes is not sensitive to cloud coverage",
subtitle = "No clear relationship is shown between cloud and bike demand",
x = "Cloud Cover",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
#sunlight
train_data_date %>%
ggplot(aes(x = factor(sun_is_out), y = lincoln_memorial_departures)) +
geom_col(color = "lightgoldenrod1", width = 0.3) +
geom_smooth() +
theme_bw() +
facet_wrap(~month) +
labs(title = "Passengers have high preference to ride bikes during daytime",
subtitle = "While in cold seasons, the gap in demand shrinks drastically",
x = "Sunlight (sun is out = 1, otherwise = 0)",
y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
heatmap <- train_data_date %>%
select (hour, lincoln_memorial_departures) %>%
group_by (hour) %>%
summarize (sum_rider =sum(lincoln_memorial_departures))
#reshape data
data_melt <- melt(heatmap, id= "hour")
# plot the graph
data_melt %>%
ggplot(aes(hour, variable)) +
geom_tile(aes(fill = value)) +
labs(x = NULL,
y = NULL) +
scale_fill_gradientn(colors = "blue") +
#scale_fill_gradientn(colours = rainbow(10)) +
#scale_fill_gradient(low = "white", mid = "blue", high = "black")+
scale_fill_gradient(low = "#FFFFFFFF",
high = "#012345") +
theme(legend.position = "right",
legend.direction = "vertical",
axis.line.x = element_blank(),
panel.grid.major.y = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.